
// Update: 2/11/20 -> 2015/2016 variables that are missing

// Close any log left open by an earlier run (capture ignores the error when
// none is open), then start a fresh log for this do-file.
capture log close
log using "$logs/cr-iat-data-set.log", replace

	// BASED OFF OF VERSION cr-iat-data-set-v10

********************************************************************************
* PART 1: Clean IAT data
********************************************************************************

// 2008-2013
// The same cleaning steps are applied to each of these six yearly files.
forvalues yr = 2008/2013 {
	// input data
	use "$raw/iat/IAT`yr'.dta", clear

	// basic cleaning: stop if any record's study name fails the Race IAT pattern
	assert regexm(study_name, "Demo.Race")

	// strip leading/trailing blanks from the geography identifiers
	foreach geo of varlist state countyno {
		replace `geo' = trim(`geo')
	}

	// state and county must be both present or both absent; keep only rows with both
	egen rm = rowmiss(state countyno)
	assert inlist(rm, 0, 2)
	keep if rm == 0

	// Missingness
	summarize month* year* tblack* twhite* *race* *ethnicity* *sex* *edu* occu* *age* num d_biep*

	tabulate sex
	tabulate num

	// identify variables to keep
	keep month year /// need this to identify school year
		raceomb ethnicityomb sex edu edu_14 /// demos
		occu* age num /// subgroups
		d_biep_white_good_all tblack_0to10 twhite_0to10 /// outcomes
		state countyno

	// final prep: convert string variables that hold only numbers, then save
	destring *, replace
	save "$data/race_iat_public_clean_`yr'.dta", replace
}

// Reload each cleaned file and describe the missingness of the kept variables
forvalues yr = 2008/2013 {
	use "$data/race_iat_public_clean_`yr'.dta", clear
	mdesc
}
	
// 2014
	// input data
	use "$raw/iat/IAT2014.dta", clear

	// basic cleaning: stop if any record's study name fails the Race IAT pattern
	assert regexm(study_name, "Demo.Race")

	// strip leading/trailing blanks from the geography identifiers
	foreach geo of varlist state countyno {
		replace `geo' = trim(`geo')
	}

	// state and county must be both present or both absent; keep only rows with both
	egen rm = rowmiss(state countyno)
	assert inlist(rm, 0, 2)
	keep if rm == 0

	// Missingness
	summarize month* year* tblack* twhite* *race* *ethnicity* *sex* *edu* occu* *age* num d_biep*

	// Each thermometer appears as a 0to10 and a 1to11 variable. Check that no
	// row has both versions filled in before combining them.
	foreach grp in black white {
		egen rm_`grp' = rowmiss(t`grp'_0to10 t`grp'_1to11)
		assert rm_`grp' > 0
	}
	drop rm*

	// Back-fill the 0to10 version from the 1to11 version, shifted down by one
	foreach grp in black white {
		replace t`grp'_0to10 = t`grp'_1to11 - 1 if missing(t`grp'_0to10)
	}

	// identify variables to keep
	keep month year /// need this to identify school year
		raceomb ethnicityomb sex edu edu_14 /// demos
		occu* age num /// subgroups
		d_biep_white_good_all tblack_0to10 twhite_0to10 /// outcomes
		state countyno

	// missingness of the kept variables
	mdesc

	// final prep: convert string variables that hold only numbers, then save
	destring *, replace
	save "$data/race_iat_public_clean_2014.dta", replace
	
// 2015
// Same cleaning as earlier years, plus back-filling of sex, age and education
// from the alternative variables present in this file.
	// input data
	use "$raw/iat/IAT2015.dta", clear

	// basic cleaning
	assert regexm(study_name,"Demo.Race")==1
	
	foreach var in state countyno {
		replace `var' = trim(`var')
	}
	// state and county must be both present or both absent; keep rows with both
	egen rm = rowmiss(state countyno)
	assert rm == 0 | rm == 2 
	keep if rm == 0	
	
	// Missingness
	sum month* year* tblack* twhite* *race* *ethnicity* *sex* *edu* occu* *age* num d_biep* 
	
	// Thermometers: confirm no row has both the 0to10 and 1to11 versions,
	// then back-fill 0to10 from 1to11 shifted down by one.
	egen rm2 = rowmiss(tblack_0to10 tblack_1to11)
		assert rm2!=0
	egen rm3 = rowmiss(twhite_0to10 twhite_1to11)
		assert rm3!=0
		drop rm*
		replace tblack_0to10=tblack_1to11-1 if mi(tblack_0to10)
		replace twhite_0to10=twhite_1to11-1 if mi(twhite_0to10)
		
	// Sex: start from sex_5 (codes 1,4 -> "m"; 2,3 -> "f"), then let an
	// explicit "m"/"f" in the original sex variable take precedence.
	gen temp = "m" if inlist(sex_5,1,4)
		replace temp = "f" if inlist(sex_5, 2,3)
		tab temp sex
		replace temp = "f" if sex=="f"
		replace temp = "m" if sex == "m"
	drop sex
	rename temp sex
	
	// How many of the birth* variables are missing per row (diagnostic only)
	egen rm2 = rowmiss(birth*)
	tab rm2
	drop rm2
	
		// Age: where age is missing, approximate it as the days between the
		// first of the birth month and the test date, divided by 365.
		// The result is fractional (not a whole number of years).
		gen mdy=mdy(month,day,year)
		gen mdy2 = mdy(birthmonth,1,birthyear)
		gen diff=mdy-mdy2
		gen temp = diff/365
		replace age = temp if mi(age)
		drop mdy mdy2 diff temp
		
		// Education: fill edu from edunotstudent, then from edu_14.
		// FIX: edu_14 is now filled only where edu_14 itself is missing.
		// The old condition (mi(edu)) ran after edu had already been filled
		// from edunotstudent, so it only touched rows where edunotstudent was
		// missing and overwrote a valid edu_14 with missing, which disabled
		// the edu_14 fallback on the next line.
		replace edu=edunotstudent if mi(edu)
		replace edu_14=edunotstudent if mi(edu_14)
		replace edu=edu_14 if mi(edu)
		tab edu edunotstudent if mi(edu),m
		tab edustudent edu if mi(edu),m
		
			// Last resort: map edustudent codes onto the edu scale
			gen temp = 1 if edustudent==2
				replace temp=2 if edustudent==3
				replace temp=5 if inlist(edustudent,4,5)
				replace temp=7 if inlist(edustudent,6,7,8,9,10,11)
			replace edu=temp if mi(edu)

	
	// identify variables to keep
	keep month year /// need this to identify school year
		raceomb ethnicityomb sex edu edu_14 /// demos
		occu* age num /// subgroups
		d_biep_white_good_all tblack_0to10 twhite_0to10 /// outcomes
		state countyno
		
	mdesc
	
	// final prep
	destring *, replace
	save "$data/race_iat_public_clean_2015.dta", replace		
		
// 2016
// Same cleaning as earlier years; sex and age are built from other variables
// in this file, and education is back-filled from the alternative variables.
	// input data
	use "$raw/iat/IAT2016.dta", clear

	// basic cleaning
	assert regexm(study_name,"Demo.Race")==1
	
	foreach var in state countyno {
		replace `var' = trim(`var')
	}
	// state and county must be both present or both absent; keep rows with both
	egen rm = rowmiss(state countyno)
	assert rm == 0 | rm == 2 
	keep if rm == 0	
	
	// Missingness
	sum month* year* tblack* twhite* *race* *ethnicity* *sex* *edu* occu* *birth* num d_biep* 
	
	// Thermometers: confirm no row has both the 0to10 and 1to11 versions,
	// then back-fill 0to10 from 1to11 shifted down by one.
	egen rm2 = rowmiss(tblack_0to10 tblack_1to11)
		assert rm2!=0
	egen rm3 = rowmiss(twhite_0to10 twhite_1to11)
		assert rm3!=0
		drop rm*
		replace tblack_0to10=tblack_1to11-1 if mi(tblack_0to10)
		replace twhite_0to10=twhite_1to11-1 if mi(twhite_0to10)
		
	// Sex: start from birthsex (1 -> "m", 2 -> "f"); sex_5 (1,4 -> "m";
	// 2,3 -> "f") overrides it where available.
	// NOTE(review): in the 2015 section the original sex variable takes
	// precedence over sex_5, whereas here sex_5 takes precedence over
	// birthsex - confirm this difference is intended.
	gen sex = "m" if birthsex==1
		replace sex="f" if birthsex==2
		replace sex ="m" if inlist(sex_5,1,4)
		replace sex="f" if inlist(sex_5,2,3)
	
	// How many of birthmonth/birthyear are missing per row (diagnostic only)
	egen rm2 = rowmiss(birthmonth birthyear)
	tab rm2
	drop rm2
	
		// Age: approximate as the days between the first of the birth month
		// and the test date, divided by 365. The result is fractional.
		gen mdy=mdy(month,day,year)
		gen mdy2 = mdy(birthmonth,1,birthyear)
		gen diff=mdy-mdy2
		gen temp = diff/365
		gen age = temp 
		drop mdy mdy2 diff temp
		
		// Education: fill edu from edunotstudent, then from edu_14.
		// FIX: edu_14 is now filled only where edu_14 itself is missing.
		// The old condition (mi(edu)) ran after edu had already been filled
		// from edunotstudent, so it only touched rows where edunotstudent was
		// missing and overwrote a valid edu_14 with missing, which disabled
		// the edu_14 fallback on the next line.
		replace edu=edunotstudent if mi(edu)
		replace edu_14=edunotstudent if mi(edu_14)
		replace edu=edu_14 if mi(edu)
		tab edu edunotstudent if mi(edu),m
		tab edustudent edu if mi(edu),m
		
			// Last resort: map edustudent codes onto the edu scale
			gen temp = 1 if edustudent==2
				replace temp=2 if edustudent==3
				replace temp=5 if inlist(edustudent,4,5)
				replace temp=7 if inlist(edustudent,6,7,8,9,10,11)
			replace edu=temp if mi(edu)

	// identify variables to keep
	keep month year /// need this to identify school year
		raceomb ethnicityomb sex edu edu_14 /// demos
		occu* age num /// subgroups
		d_biep_white_good_all tblack_0to10 twhite_0to10 /// outcomes
		state countyno
		
	mdesc
	
	// final prep
	destring *, replace
	save "$data/race_iat_public_clean_2016.dta", replace	
			
********************************************************************************
* PART 2: Prepare data for MrP estimation
********************************************************************************

	// Create an all file
	// Start from the 2008 cleaned file and stack 2009-2016 beneath it.
	use "$data/race_iat_public_clean_2008.dta", clear
	forvalues yr = 2009/2016 {
		append using "$data/race_iat_public_clean_`yr'.dta"
	}

	// Explore missingness of key variables over time
	bysort year: mdesc age num raceomb sex d_biep_white_good_all tblack_0to10 twhite_0to10 edu occupation
	bysort year: tabulate raceomb

	// Give respondents with no recorded race their own code (9)
	replace raceomb = 9 if missing(raceomb)

	// drop those who have taken it once before (only rows with num equal to "0" remain)
	keep if num == "0"
		
	// identify teachers
	// k12 is 1 when any occupation field holds the code "25-2000", else 0.
	// Per the original notes, occupation covers the 2006-2015 files and the
	// three self-reported fields cover 2016.
	generate k12 = (occupation=="25-2000" | occupation_self=="25-2000" | occuselfdetail=="25-2000" | occupation_selfdetail_001=="25-2000")

	// account for missings: flag each occupation field that holds no usable value
	generate occmiss = inlist(occupation, "", ".", "null")
	generate occselfmiss = inlist(occupation_self, "", ".", "null")
	generate occselfdetmiss = inlist(occuselfdetail, "", ".", "-999", "null")
	generate occselfdet01miss = inlist(occupation_selfdetail_001, "null", "", "-999", ".")

	tab1 occmiss occselfmiss occselfdetmiss occselfdet01miss, m

	// occupation is unknown only when all four fields are unusable
	replace k12 = . if occmiss + occselfmiss + occselfdetmiss + occselfdet01miss == 4

	// teacher and not_teacher are each 1-or-missing indicators derived from k12
	generate teacher = 1 if k12 == 1
	generate not_teacher = 1 if k12 == 0
		
	// generate outcome variables
	// explicit bias = white thermometer minus black thermometer;
	// implicit bias = the IAT D score (d_biep_white_good_all), renamed
	gen explicitbias = twhite_0to10-tblack_0to10
	rename d_biep_white_good_all implicitbias
	
		// see https://implicit.harvard.edu/implicit/demo/background/raceinfo.html
		
	// keep only responses from the time periods that overlap with the SEDA/OCR data
	// (July 2008 through June 2016; amended to include 2016 by dave).
	// FIX: Stata treats a missing value as larger than any number, so a bare
	// "month >= 7" is true when month is missing. The !mi(month) guards stop
	// 2008 rows with unknown month from being kept, and stop rows with unknown
	// month from being assigned to the following school year.
	keep if inlist(year,2009,2010,2011,2012,2013,2014,2015) | (year==2008 & month >= 7 & !mi(month)) | (year==2016 & month <= 6)
	// school year is labelled by its spring calendar year: Jan-Jun keeps the
	// calendar year, Jul-Dec rolls forward; unknown month leaves it missing
	gen schoolyear = year if month <= 6
		replace schoolyear = year+1 if month >= 7 & !mi(month)
							
	// keep important variables 
	keep implicitbias explicitbias state countyno  k12 teacher not_teacher schoolyear sex raceomb age edu 

	// statastates is an external command (not defined in this file); the lines
	// below rely on it adding state_fips, state_name and _merge, keyed on the
	// state abbreviation.
	statastates, abbr(state)
	tabulate state if _merge == 1 // Not in the 50
	drop if _merge == 1
	drop _merge

	// Prep county var
	// Build a 5-digit id: 2-digit state FIPS followed by 3-digit county number.
	tostring countyno, generate(countyid)
	assert length(countyid) <= 3
	replace countyid = substr("00" + countyid, -3, 3) // left-pad with zeros to width 3
	tostring state_fips, generate(fips)
	replace fips = "0" + fips if length(fips) < 2 // left-pad with a zero to width 2
	replace countyid = fips + countyid
	assert length(countyid) == 5

	// tidy up: drop the inputs, store the ids as numbers, put geography first
	drop countyno state_name state_fips
	destring fips countyid, replace
	order fips state countyid
	
		// What proportion of respondents are teachers in counties on average?
		// Diagnostic only: every helper variable created here is dropped again.
		egen rm = rowmiss(*bias) // 70% have both
		drop rm

		// per county: number of respondents and number flagged as teachers
		bysort countyid: generate n_resp = _N
		bysort countyid: egen n_tch = total(teacher)
		generate prop = n_tch / n_resp

		// summarise the share once per county
		egen one_per_cty = tag(countyid)
		summarize prop if one_per_cty, detail // 3% is median. So the overall bias is going to be mainly nonteacher driven.
		drop prop one_per_cty n_resp n_tch

********************************************************************************
* PART 3: Save datasets
********************************************************************************
	
	// For RQ1
	// Build the analysis variables inside preserve/restore so the data in
	// memory are unchanged for the MRP section that follows.
	preserve
		
		// Prep variables for analysis
		// female: 1 for "f"/"F", 0 for "m"/"M", missing otherwise
		g female=. 
		replace female=1 if inlist(sex, "f", "F")
		replace female=0 if inlist(sex, "m", "M")

		// Age categories by decade.
		// FIX: the first category is now age<30 rather than age<=29. Ages
		// back-filled for 2015/2016 are computed as days/365 and so are
		// fractional; values strictly between 29 and 30 previously matched
		// no category and were left missing.
		g agecat=1 if age<30
		replace agecat=2 if age>=30
		replace agecat=3 if age>=40
		replace agecat=4 if age>=50
		replace agecat=5 if age>=60
		replace agecat=6 if age>=70
		// missing age compares as larger than any number, so undo it here
		replace agecat =. if mi(age)

		la var agecat "age category based on age var"
		// first label changed from "<29" to "<30" to match the cutoff above
		la def ag 1 "<30" 2 "30-39" 3 "40-49" 4 "50-59" 5 "60-69" 6 "70+"
		la val agecat ag

		tab raceomb, m

		// One indicator per raceomb value, named in sorted order.
		// NOTE(review): the renames assume exactly nine distinct raceomb
		// values are present, in this order - confirm against the tab above.
		tab raceomb, g(race)
		ren race1	amind
		ren race2	eastasian
		ren race3	southasian
		ren race4	nathaw
		ren race5	black
		ren race6	white
		ren race7	blackwhite
		ren race8	multiother
		ren race9	otherun

		// Collapse the education codes into six levels (labels below)
		gen ed = 1 if inlist(edu,1,2,3)
			replace ed = 2 if inlist(edu,4)
			replace ed = 3 if inlist(edu,5,6)
			replace ed = 4 if inlist(edu,7)
			replace ed = 5 if inlist(edu,8,9)
			replace ed = 6 if inlist(edu,10,11,12,13,14)

		la def ed 1 "elem/jr high, some hs" 2 "hs degree" 3 "some college, assoc" 4 "bach degree" 5 "some grad sch/master's" 6 "advanced degree"
		la val ed ed
			
		save "$data/iat", replace
	restore
	
	// For MRP
	// Recode age, sex and race into string categories, attach a division
	// code per state, and save the all-respondent and teacher-only files.

		// Recode
		// FIX: lower bounds are now inclusive (>= 18, >= 30, >= 45, >= 65).
		// Ages back-filled for 2015/2016 are fractional (days/365), and the
		// old bounds (> 17, > 29, > 44, > 64) pushed values such as 29.5 into
		// the next bracket. Whole-number ages are classified as before.
		gen temp = "under18" if age < 18
			replace temp = "18to29" if age >= 18 & age < 30
			replace temp = "30to44" if age >= 30 & age < 45
			replace temp = "45to64" if age >= 45 & age < 65
			replace temp = "over65" if age >= 65 & !mi(age)
			replace temp = "" if mi(age)
			drop age
			rename temp age
		// sex: blank out anything other than m/f, then lower-case
		replace sex = "" if !inlist(sex,"M","F","m","f")
		replace sex = lower(sex)
		// race: raceomb 5 -> "bl", 6 -> "wh", all other listed codes -> "oth"
		gen race = "bl" if raceomb==5
			replace race = "wh" if raceomb==6
			replace race = "oth" if inlist(raceomb,1,2,3,4,7,8,9)
			drop raceomb
		
		// Build a state -> division lookup from a Stata Press example
		// dataset (downloaded at run time, so this step needs network access).
		// NOTE(review): assumes the dataset's state variable uses the same
		// numeric codes as fips here - confirm.
		preserve
			use http://www.stata-press.com/data/r13/educ3, clear
			keep state division
			duplicates drop
			rename state fips
			drop if mi(division)
			tempfile div
			save `div'
		restore
		// keep(master match): never append lookup-only rows as empty observations
		merge m:1 fips using `div', keep(master match)
		// fill in the two states the assert below would otherwise fail on
		replace division = 8 if state=="WY"
		replace division = 5 if state=="DC"
		assert !mi(division)
		drop _merge
		label drop _all
		
		saveold "$data/iat_mrp_all", replace version(12)
		
		// teacher-only subset
		keep if k12==1
		saveold "$data/iat_mrp_tch", replace version(12)
				
log close


